{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Monte Carlo Option pricing using Python libraries\n",
    "\n",
    "\n",
    "Due to the complicated nature of the barrier and price algorithmic averaging, there is no analytical solution for this example of [exotic option](https://www.investopedia.com/terms/e/exoticoption.asp). We can use the Monte Carlo simulation method to estimate the expected value of profit on the maturity day. Traditionally, Monte Carlo pricing is done in the C/C++ CUDA code. In this notebook, we will show this can be done efficiently in the Python libraries like Numba and CuPy.\n",
    "\n",
    "Following are the parameters we choose to price the example Asian Barrier Option:\n",
    "\n",
    "    Maturity (T): 1 year\n",
    "    Spot (S) : 120\n",
    "    Strike (K): 110\n",
    "    Volatility (sigma): 35.0 %\n",
    "    Risk Free Rate (r): 5.0 %\n",
    "    Stock Drift Rate (mu): 10.0 %\n",
    "    Barrier (B): 100\n",
    "    \n",
    "To run this notebook successfully, it is advised to use GPUs with at least 16G memory. V100 GPUs are recommended.\n",
    "\n",
    "### CUDA Monte Carlo Option Pricing\n",
    "\n",
    "Traditionally, the Monte Caro Option pricing is implemented in CUDA C/C++. Following is one example,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "```C\n",
       "#include <vector>\n",
       "#include <stdio.h>\n",
       "#include <iostream>\n",
       "#include <chrono>\n",
       "#include <cuda_runtime.h>\n",
       "#include <helper_cuda.h>\n",
       "#include <curand.h>\n",
       " \n",
       "#define CHECKCURAND(expression)                         \\\n",
       "  {                                                     \\\n",
       "    curandStatus_t status = (expression);                         \\\n",
       "    if (status != CURAND_STATUS_SUCCESS) {                        \\\n",
       "      std::cerr << \"Curand Error on line \" << __LINE__<< std::endl;     \\\n",
       "      std::exit(EXIT_FAILURE);                                          \\\n",
       "    }                                                                   \\\n",
       "  }\n",
       "\n",
       "// atomicAdd is introduced for compute capability >=6.0\n",
       "#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600\n",
       "#else\n",
       "__device__ double atomicAdd(double* address, double val)\n",
       "{\n",
       "      printf(\"device arch <=600\\n\");\n",
       "        unsigned long long int* address_as_ull = (unsigned long long int*)address;\n",
       "          unsigned long long int old = *address_as_ull, assumed;\n",
       "            do {\n",
       "                    assumed = old;\n",
       "                        old = atomicCAS(address_as_ull, assumed,\n",
       "                                                    __double_as_longlong(val + __longlong_as_double(assumed)));\n",
       "                          } while (assumed != old);\n",
       "              return __longlong_as_double(old);\n",
       "}\n",
       "#endif\n",
       "\n",
       "__global__ void sumPayoffKernel(float *d_s, const unsigned N_PATHS, double *mysum)\n",
       "{\n",
       "  unsigned idx =  threadIdx.x + blockIdx.x * blockDim.x;\n",
       "  unsigned stride = blockDim.x * gridDim.x;\n",
       "  unsigned tid = threadIdx.x;\n",
       "\n",
       "  extern __shared__ double smdata[];\n",
       "  smdata[tid] = 0.0;\n",
       "\n",
       "  for (unsigned i = idx; i<N_PATHS; i+=stride)\n",
       "  {\n",
       "    smdata[tid] += (double) d_s[i];\n",
       "  }\n",
       "\n",
       "  for (unsigned s=blockDim.x/2; s>0; s>>=1)\n",
       "  {\n",
       "    __syncthreads();\n",
       "    if (tid < s) smdata[tid] += smdata[tid + s];\n",
       "  }\n",
       "\n",
       "  if (tid == 0)\n",
       "  {\n",
       "    atomicAdd(mysum, smdata[0]);\n",
       "  }\n",
       "}\n",
       "\n",
       "__global__ void barrier_option(\n",
       "    float *d_s,\n",
       "    const float T,\n",
       "    const float K,\n",
       "    const float B,\n",
       "    const float S0,\n",
       "    const float sigma,\n",
       "    const float mu,\n",
       "    const float r,\n",
       "    const float * d_normals,\n",
       "    const long N_STEPS,\n",
       "    const long N_PATHS)\n",
       "{\n",
       "  unsigned idx =  threadIdx.x + blockIdx.x * blockDim.x;\n",
       "  unsigned stride = blockDim.x * gridDim.x;\n",
       "  const float tmp1 = mu*T/N_STEPS;\n",
       "  const float tmp2 = exp(-r*T);\n",
       "  const float tmp3 = sqrt(T/N_STEPS);\n",
       "  double running_average = 0.0;\n",
       "\n",
       "  for (unsigned i = idx; i<N_PATHS; i+=stride)\n",
       "  {\n",
       "    float s_curr = S0;\n",
       "    for(unsigned n = 0; n < N_STEPS; n++){\n",
       "       s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS];\n",
       "       running_average += (s_curr - running_average) / (n + 1.0) ;\n",
       "       if (running_average <= B){\n",
       "           break;\n",
       "       }\n",
       "    }\n",
       "\n",
       "    float payoff = (running_average>K ? running_average-K : 0.f);\n",
       "    d_s[i] = tmp2 * payoff;\n",
       "  }\n",
       "}\n",
       "\n",
       "int main(int argc, char *argv[]) {\n",
       "  try {\n",
       "    // declare variables and constants\n",
       "    size_t N_PATHS = 8192000;\n",
       "    size_t N_STEPS = 365;\n",
       "    if (argc >= 2)  N_PATHS = atoi(argv[1]);\n",
       "\n",
       "    if (argc >= 3)  N_STEPS = atoi(argv[2]);\n",
       "\n",
       "    const float T = 1.0f;\n",
       "    const float K = 110.0f;\n",
       "    const float B = 100.0f;\n",
       "    const float S0 = 120.0f;\n",
       "    const float sigma = 0.35f;\n",
       "    const float mu = 0.1f;\n",
       "    const float r = 0.05f;\n",
       "\n",
       "\n",
       "    double gpu_sum{0.0};\n",
       "\n",
       "    int devID{0};\n",
       "    cudaDeviceProp deviceProps;\n",
       "\n",
       "    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));\n",
       "    printf(\"CUDA device [%s]\\n\", deviceProps.name);\n",
       "    printf(\"GPU Device %d: \\\"%s\\\" with compute capability %d.%d\\n\\n\", devID, deviceProps.name, deviceProps.major, deviceProps.minor);\n",
       "    // Generate random numbers on the device\n",
       "    curandGenerator_t curandGenerator;\n",
       "    CHECKCURAND(curandCreateGenerator(&curandGenerator, CURAND_RNG_PSEUDO_MTGP32));\n",
       "    CHECKCURAND(curandSetPseudoRandomGeneratorSeed(curandGenerator, 1234ULL)) ;\n",
       "\n",
       "    const size_t N_NORMALS = (size_t)N_STEPS * N_PATHS;\n",
       "    float *d_normals;\n",
       "    checkCudaErrors(cudaMalloc(&d_normals, N_NORMALS * sizeof(float)));\n",
       "    CHECKCURAND(curandGenerateNormal(curandGenerator, d_normals, N_NORMALS, 0.0f, 1.0f));\n",
       "    cudaDeviceSynchronize();\n",
       "\n",
       "  \t// before kernel launch, check the max potential blockSize\n",
       "  \tint BLOCK_SIZE, GRID_SIZE;\n",
       "  \tcheckCudaErrors(cudaOccupancyMaxPotentialBlockSize(&GRID_SIZE,\n",
       "  \t                                                   &BLOCK_SIZE,\n",
       "  \t                                                   barrier_option,\n",
       "  \t                                                   0, N_PATHS));\n",
       "\n",
       "  \tstd::cout << \"suggested block size \" << BLOCK_SIZE\n",
       "  \t          << \" \\nsuggested grid size \" << GRID_SIZE\n",
       "  \t          << std::endl;\n",
       "\n",
       "  \tstd::cout << \"Used grid size \" << GRID_SIZE << std::endl;\n",
       "\n",
       "  \t// Kernel launch\n",
       "  \tauto t1=std::chrono::high_resolution_clock::now();\n",
       "\n",
       "  \tfloat *d_s;\n",
       "  \tcheckCudaErrors(cudaMalloc(&d_s, N_PATHS*sizeof(float)));\n",
       "\n",
       "  \tauto t3=std::chrono::high_resolution_clock::now();\n",
       "  \tbarrier_option<<<GRID_SIZE, BLOCK_SIZE>>>(d_s, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS);\n",
       "  \tcudaDeviceSynchronize();\n",
       "  \tauto t4=std::chrono::high_resolution_clock::now();\n",
       "\n",
       "  \tdouble* mySum;\n",
       "  \tcheckCudaErrors(cudaMallocManaged(&mySum, sizeof(double)));\n",
       "  \tsumPayoffKernel<<<GRID_SIZE, BLOCK_SIZE, BLOCK_SIZE*sizeof(double)>>>(d_s, N_PATHS, mySum);\n",
       "  \tcudaDeviceSynchronize();\n",
       "  \tauto t5=std::chrono::high_resolution_clock::now();\n",
       "\n",
       "  \tstd::cout << \"sumPayoffKernel takes \"\n",
       "  \t          << std::chrono::duration_cast<std::chrono::microseconds>(t5-t4).count() / 1000.f\n",
       "  \t          << \" ms\\n\";\n",
       "\n",
       "  \tgpu_sum = mySum[0] / N_PATHS;\n",
       "\n",
       "  \tauto t2=std::chrono::high_resolution_clock::now();\n",
       "\n",
       "  \t// clean up\n",
       "  \tCHECKCURAND(curandDestroyGenerator( curandGenerator )) ;\n",
       "  \tcheckCudaErrors(cudaFree(d_s));\n",
       "  \tcheckCudaErrors(cudaFree(d_normals));\n",
       "  \tcheckCudaErrors(cudaFree(mySum));\n",
       "\n",
       "  \tstd::cout << \"price \"\n",
       "              << gpu_sum\n",
       "              << \" time \"\n",
       "  \t          << std::chrono::duration_cast<std::chrono::microseconds>(t5-t1).count() / 1000.f\n",
       "  \t          << \" ms\\n\";\n",
       "  }\n",
       "\n",
       "  catch(std::\n",
       "        exception& e)\n",
       "  {\n",
       "    std::cout<< \"exception: \" << e.what() << \"\\n\";\n",
       "  }\n",
       "}\n",
       "\n",
       "   ```"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import Markdown as md\n",
    "f = open('cuda_pricing.cu', 'r')\n",
    "md(\"\"\"```C\n",
    "%s\n",
    "   ```\"\"\" % (f.read()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The CUDA code is usually long and detailed. In general, it is performing a sequence of 5 tasks:\n",
    "1. Allocate GPU memory to store the random number and simulation path results\n",
    "2. Call cuRand library to generate random numbers\n",
    "3. Launch the barrier option kernel to do parallel simulations\n",
    "4. Launch the sum kernel to aggregate the terminal derivative prices.\n",
    "5. Deallocate the memory\n",
    "\n",
    "Developers have to perform each step explicitly. \n",
    "\n",
    "Compile and run the code:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "make: 'out' is up to date.\n",
      "CUDA device [Tesla V100-SXM2-16GB]\n",
      "GPU Device 0: \"Tesla V100-SXM2-16GB\" with compute capability 7.0\n",
      "\n",
      "suggested block size 1024 \n",
      "suggested grid size 160\n",
      "Used grid size 160\n",
      "sumPayoffKernel takes 1.259 ms\n",
      "price 18.7026 time 23.123 ms\n"
     ]
    }
   ],
   "source": [
    "!make out\n",
    "!./out"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compiling and running this CUDA code on a V100 GPU produces the correct option price $18.70$ in $22.05ms$ for $8.192$ million paths and $365$ steps. We will use these numbers as our reference benchmark for later comparison. Among the 5 steps, the critical component is step 3, where data scientists need to describe the detailed Monte Carlo simulation. Ideally the data scientists efforts should be focused on this step. \n",
    "\n",
    "## Python Monte Carlo Option Pricing\n",
    "We set the constants for the option and load the necessary libraries:-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cupy\n",
    "import numpy as np\n",
    "import math\n",
    "import time\n",
    "import numba\n",
    "from numba import cuda\n",
    "from numba import njit\n",
    "from numba import prange\n",
    "import cudf\n",
    "cupy.cuda.set_allocator(None)\n",
    "\n",
    "N_PATHS = 8192000\n",
    "N_STEPS = 365\n",
    "T = 1.0\n",
    "K = 110.0\n",
    "B = 100.0\n",
    "S0 = 120.0\n",
    "sigma = 0.35\n",
    "mu = 0.1\n",
    "r = 0.05"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we know the [Standard Error of the Mean](https://en.wikipedia.org/wiki/Standard_error) is proportional to the inversed square root of the number of samples. Hence the more simulation paths we have, the more accurate the pricing will be. We will simulate $8.192$ million paths with $365$ steps where each step represents a day. \n",
    "\n",
    "#### Single Thread CPU\n",
    "The single thread CPU code for the Monte Carlo simulation has two nested for-loops. The outer loop iterates each path while the inner loop iterates time and computes the underlying asset price for that day. Note that this code is accelerated via [Numba @jit](http://numba.pydata.org/) hence it compiles into machine code at runtime. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "@njit(fastmath=True)\n",
    "def cpu_barrier_option(d_s, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS):\n",
    "    tmp1 = mu*T/N_STEPS\n",
    "    tmp2 = math.exp(-r*T)\n",
    "    tmp3 = math.sqrt(T/N_STEPS)\n",
    "    running_average = 0.0\n",
    "    for i in range(N_PATHS):\n",
    "        s_curr = S0\n",
    "        for n in range(N_STEPS):\n",
    "            s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS]\n",
    "            running_average = running_average + 1.0/(n + 1.0) * (s_curr - running_average)\n",
    "            if running_average <= B:\n",
    "                break\n",
    "\n",
    "        payoff = running_average - K if running_average>K else 0\n",
    "        d_s[i] = tmp2 * payoff"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "  We use CuPy to generate Gaussian random numbers in the GPU and allocate an array to store the prices at maturity.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "randoms_gpu = cupy.random.normal(0, 1, N_PATHS * N_STEPS, dtype=cupy.float32)\n",
    "randoms_cpu = np_randoms = cupy.asnumpy(randoms_gpu)\n",
    "output =  np.zeros(N_PATHS, dtype=np.float32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we will run the Monte Carlo simulation and time it. When the Numba accelerated function is called for the first time, there is some overhead to compile it. So to time it accurately, we run this method twice and and consider the run time of the second attempt. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time 27.778984785079956 v 18.716661\n"
     ]
    }
   ],
   "source": [
    "cpu_barrier_option(output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r), randoms_cpu, N_STEPS, N_PATHS)\n",
    "s = time.time()\n",
    "cpu_barrier_option(output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r), randoms_cpu, N_STEPS, N_PATHS)\n",
    "v = output.mean()\n",
    "e = time.time()\n",
    "print('time', e-s, 'v', v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Multiple Cores CPU\n",
    "CPU has multiple cores and to make a fair comparison, the code can be modified a little to take advantage of all the CPU cores. Note how we parallelize the outer loop:-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "@njit(fastmath=True, parallel=True)\n",
    "def cpu_multiplecore_barrier_option(d_s, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS):\n",
    "    tmp1 = mu*T/N_STEPS\n",
    "    tmp2 = math.exp(-r*T)\n",
    "    tmp3 = math.sqrt(T/N_STEPS)\n",
    "    for i in prange(N_PATHS):\n",
    "        s_curr = S0\n",
    "        running_average = 0.0\n",
    "        for n in range(N_STEPS):\n",
    "            s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS]\n",
    "            running_average = running_average + 1.0/(n + 1.0) * (s_curr - running_average)\n",
    "            if running_average <= B:\n",
    "                break\n",
    "        payoff = running_average - K if running_average>K else 0\n",
    "        d_s[i] = tmp2 * payoff"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Running this parallel code and timing it:-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time 1.3648085594177246 v 18.716661\n"
     ]
    }
   ],
   "source": [
    "cpu_multiplecore_barrier_option(output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r), randoms_cpu, N_STEPS, N_PATHS)\n",
    "s = time.time()\n",
    "cpu_multiplecore_barrier_option(output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r), randoms_cpu, N_STEPS, N_PATHS)\n",
    "v = output.mean()\n",
    "e = time.time()\n",
    "print('time', e-s, 'v', v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We see approximately $32x$ speedup due to $32$ cores of the CPU. \n",
    "\n",
    "#### NUMBA GPU\n",
    "The multiple cores CPU code can be modified easily to run in the GPU via Numba.cuda.jit. The code below is very similar to the CPU multiple core code except that we parallelize the outer loop on the GPU. Running this code and timing it:-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "@cuda.jit\n",
    "def numba_gpu_barrier_option(d_s, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS):\n",
    "    # ii - overall thread index\n",
    "    ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
    "    stride = cuda.gridDim.x * cuda.blockDim.x\n",
    "    tmp1 = mu*T/N_STEPS\n",
    "    tmp2 = math.exp(-r*T)\n",
    "    tmp3 = math.sqrt(T/N_STEPS)\n",
    "    running_average = 0.0\n",
    "    for i in range(ii, N_PATHS, stride):\n",
    "        s_curr = S0\n",
    "        for n in range(N_STEPS):\n",
    "            s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS]\n",
    "            running_average += (s_curr - running_average) / (n + 1.0)\n",
    "            if running_average <= B:\n",
    "                break\n",
    "        payoff = running_average - K if running_average>K else 0\n",
    "        d_s[i] = tmp2 * payoff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time 0.062163591384887695 v 18.716661\n"
     ]
    }
   ],
   "source": [
    "\n",
    "number_of_threads = 256\n",
    "number_of_blocks = (N_PATHS-1) // number_of_threads + 1\n",
    "output = cupy.zeros(N_PATHS, dtype=cupy.float32)\n",
    "numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)\n",
    "s = time.time()\n",
    "numba_gpu_barrier_option[(number_of_blocks,), (number_of_threads,)](output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)\n",
    "v = output.mean()\n",
    "cuda.synchronize()\n",
    "e = time.time()\n",
    "print('time', e-s, 'v', v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We get $4x$ speedup compared to the multiple cores version and $128x$ speedup compared to the single core version. \n",
    "\n",
    "#### NUMBA Shared Memory \n",
    "While accessing the global memory for Gaussian random numbers, the memory access is already aligned and numbers are only read once. So using shared memory is not helping the performance as shown below:-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "@cuda.jit\n",
    "def numba_gpu_barrier_option_shared_mem(d_s, T, K, B, S0, sigma, mu, r, d_normals, N_STEPS, N_PATHS):\n",
    "    shared = cuda.shared.array(shape=0, dtype=numba.float32)\n",
    "    # load to shared memory\n",
    "    path_offset = cuda.blockIdx.x * cuda.blockDim.x\n",
    "    ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
    "    stride = cuda.gridDim.x * cuda.blockDim.x\n",
    "    tmp1 = mu*T/N_STEPS\n",
    "    tmp2 = math.exp(-r*T)\n",
    "    tmp3 = math.sqrt(T/N_STEPS)\n",
    "    running_average = 0.0\n",
    "    for i in range(ii, N_PATHS, stride):\n",
    "        s_curr = S0\n",
    "        for n in range(N_STEPS):\n",
    "            shared[cuda.threadIdx.x] = d_normals[path_offset + cuda.threadIdx.x + n * N_PATHS]\n",
    "            s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*shared[cuda.threadIdx.x]\n",
    "            running_average += (s_curr - running_average) / (n + 1.0)\n",
    "            if running_average <= B:\n",
    "                break\n",
    "        payoff = running_average - K if running_average>K else 0\n",
    "        d_s[i] = tmp2 * payoff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time 0.06269669532775879 v 18.716661\n"
     ]
    }
   ],
   "source": [
    "number_of_threads = 256\n",
    "number_of_blocks = (N_PATHS-1) // number_of_threads + 1\n",
    "output = cupy.zeros(N_PATHS, dtype=cupy.float32)\n",
    "shared_buffer_size = number_of_threads * 4\n",
    "numba_gpu_barrier_option_shared_mem[(number_of_blocks,), (number_of_threads,), 0, shared_buffer_size](output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)\n",
    "s = time.time()\n",
    "numba_gpu_barrier_option_shared_mem[(number_of_blocks,), (number_of_threads,), 0, shared_buffer_size](output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r), randoms_gpu, N_STEPS, N_PATHS)\n",
    "v = output.mean()\n",
    "cuda.synchronize()\n",
    "e = time.time()\n",
    "print('time', e-s, 'v', v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### CUPY GPU\n",
    "CuPy provides an easy way to define GPU kernels from raw CUDA source. `RawKernel` object allows you to call the kernel with CUDA’s `cuLaunchKernel` interface. Here is an example where we wrap the Barrier Option computation code inside the `RawKernel`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "cupy_barrier_option = cupy.RawKernel(r'''\n",
    "extern \"C\" __global__ void barrier_option(\n",
    "    float *d_s,\n",
    "    const float T,\n",
    "    const float K,\n",
    "    const float B,\n",
    "    const float S0,\n",
    "    const float sigma,\n",
    "    const float mu,\n",
    "    const float r,\n",
    "    const float * d_normals,\n",
    "    const long N_STEPS,\n",
    "    const long N_PATHS)\n",
    "{\n",
    "  unsigned idx =  threadIdx.x + blockIdx.x * blockDim.x;\n",
    "  unsigned stride = blockDim.x * gridDim.x;\n",
    "  unsigned tid = threadIdx.x;\n",
    "\n",
    "  const float tmp1 = mu*T/N_STEPS;\n",
    "  const float tmp2 = exp(-r*T);\n",
    "  const float tmp3 = sqrt(T/N_STEPS);\n",
    "  double running_average = 0.0;\n",
    "\n",
    "  for (unsigned i = idx; i<N_PATHS; i+=stride)\n",
    "  {\n",
    "    float s_curr = S0;\n",
    "    unsigned n=0;\n",
    "    for(unsigned n = 0; n < N_STEPS; n++){\n",
    "       s_curr += tmp1 * s_curr + sigma*s_curr*tmp3*d_normals[i + n * N_PATHS];\n",
    "       running_average += (s_curr - running_average) / (n + 1.0) ;\n",
    "       if (running_average <= B){\n",
    "           break;\n",
    "       }\n",
    "    }\n",
    "\n",
    "    float payoff = (running_average>K ? running_average-K : 0.f);\n",
    "    d_s[i] = tmp2 * payoff;\n",
    "  }\n",
    "}\n",
    "\n",
    "''', 'barrier_option')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can launch it to compute the same Barrier Option price:-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time 0.025580167770385742 v 18.716661\n"
     ]
    }
   ],
   "source": [
    "number_of_threads = 256\n",
    "number_of_blocks = (N_PATHS-1) // number_of_threads + 1\n",
    "s = time.time()\n",
    "cupy_barrier_option((number_of_blocks,), (number_of_threads,),\n",
    "                   (output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r),  randoms_gpu, N_STEPS, N_PATHS))\n",
    "v = output.mean()\n",
    "cupy.cuda.stream.get_current_stream().synchronize()\n",
    "e = time.time()\n",
    "print('time', e-s, 'v',v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This approach is the most efficient way to use the GPU and it achieves 8x speedup compared to the 32 core CPU performance. Compared with CUDA C/C++ approach ($23ms$), CuPy performance ($25ms$) is very close to it."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multiple GPUs Option Pricing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To get a more accurate estimation of the option price, more paths are needed for Monte Carlo simulation. The single V100 GPU we used in the above example only has 32GB memory and we are hitting the memory limits to run 8M simulations. [DASK](https://dask.org/) is an integrated component of RAPIDS for distributed computation on GPUs.  We can take advantage of it to distribute the Monte Carlo simulation computation to multiple nodes across multiple GPUs. First, we need to wrap all the computation inside a function to allow the allocated GPU memory to be released at the end of the function call. Note that the function takes an extra argument for the random number seed value so the individual function calls each have an independent sequence of random numbers. Loading the DASK library and setting up the local CUDA cluster :-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# clear the GPU memory\n",
    "del randoms_gpu \n",
    "del randoms_cpu\n",
    "del output\n",
    "\n",
    "\n",
    "def get_option_price(T, K, B, S0, sigma, mu, r, N_PATHS = 8192000, N_STEPS = 365, seed=3):\n",
    "    number_of_threads = 256\n",
    "    number_of_blocks = (N_PATHS-1) // number_of_threads + 1\n",
    "    cupy.random.seed(seed)\n",
    "    randoms_gpu = cupy.random.normal(0, 1, N_PATHS * N_STEPS, dtype=cupy.float32)\n",
    "    output =  cupy.zeros(N_PATHS, dtype=cupy.float32)\n",
    "    cupy_barrier_option((number_of_blocks,), (number_of_threads,),\n",
    "                   (output, np.float32(T), np.float32(K), \n",
    "                    np.float32(B), np.float32(S0), \n",
    "                    np.float32(sigma), np.float32(mu), \n",
    "                    np.float32(r),  randoms_gpu, N_STEPS, N_PATHS))\n",
    "    v = output.mean()\n",
    "    out_df = cudf.DataFrame()\n",
    "    out_df['p'] = cudf.Series([v.item()], nan_as_null=False)\n",
    "    return out_df\n",
    "o = get_option_price(T=1.0, K=120.0, B=90.0, S0=100.0, sigma=0.2, mu=0.1, r=0.05)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table style=\"border: 2px solid white;\">\n",
       "<tr>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Client</h3>\n",
       "<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
       "  <li><b>Scheduler: </b>tcp://127.0.0.1:34615</li>\n",
       "  <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
       "</ul>\n",
       "</td>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Cluster</h3>\n",
       "<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
       "  <li><b>Workers: </b>8</li>\n",
       "  <li><b>Cores: </b>8</li>\n",
       "  <li><b>Memory: </b>540.94 GB</li>\n",
       "</ul>\n",
       "</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:34615' processes=8 threads=8, memory=540.94 GB>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dask\n",
    "import dask_cudf\n",
    "from dask.delayed import delayed\n",
    "from dask_cuda import LocalCUDACluster\n",
    "cluster = LocalCUDACluster()\n",
    "from dask.distributed import Client\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are 4 GPUs inside the system. To distribute the above function, we wrap it into the `delayed` function to integrate it into the DASK computation graph. We use `from_delayed` to gather all the distributed dataframes into a holistic cudf_dask dataframe. We can call the cudf_dask dataframe `mean` and `std` to calculate the expected mean and standard deviation of the prices."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = dask_cudf.from_delayed([delayed(get_option_price)(T=1.0, K=110.0, B=100.0, S0=120.0, sigma=0.35, mu=0.1, r=0.05, seed=3000+i) for i in range(1600)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "p    18.711432\n",
       "dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.mean().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "p    0.007374\n",
       "dtype: float64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.std().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The code computed 1600 Monte Carlo simulations of `8192000` paths. By averaging the price together to get a better estimation, the standard deviation is reduced by a factor of 1/sqrt(1600) = 1/40  "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
